%load_ext autoreload
%load_ext nb_black
%autoreload 2
import sys; sys.path.append('../')
from pathlib import Path
import pandas as pd
import plotly.express as px
import plotly.io as pio
import torch
from sklearn.preprocessing import LabelEncoder
from torch.nn import MSELoss
from src.models.rnn.data import get_dataset
from src.models.rnn.model import RNNRatings
from src.models.rnn.trainer import Trainer
from src.util import metrics
from src.util.data import get_interactions, get_sparsity_factor, get_train_test_ratings
from src.util.discretizer import RatingDiscretizer
# Set plotly's default renderer to "notebook"; the fig.show() calls further
# down do not pass a renderer explicitly.
pio.renderers.default = "notebook"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Written as a single expression so the choice cannot be broken by a lost
# indentation level.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input ratings table and destination file for the serialized model; both are
# relative to the working directory.
RATINGS_PATH = Path("../data/ratings_small.csv")
OUTPUT_PATH = Path("../models/rnn-ratings.pt")
# Load the raw ratings table.
ratings = pd.read_csv(RATINGS_PATH)

# Fit one label encoder per id column on the complete (not yet split) table.
# LabelEncoder.fit returns the fitted encoder, so it can be chained.
user_encoder = LabelEncoder().fit(ratings["userId"].values)
movie_encoder = LabelEncoder().fit(ratings["movieId"].values)
LabelEncoder()
# Rescale every rating by the largest rating found in the full table.
max_rating = ratings["rating"].values.max()
ratings["rating"] /= max_rating

# Split, then order each partition chronologically (oldest first).
train_ratings, test_ratings = (
    partition.sort_values(by="timestamp", ascending=True)
    for partition in get_train_test_ratings(ratings)
)

# Interaction structures for both partitions, followed by their sparsity
# factors (reported as percentages further down).
train_interactions = get_interactions(train_ratings, user_encoder, movie_encoder)
test_interactions = get_interactions(test_ratings, user_encoder, movie_encoder)
train_sparsity = get_sparsity_factor(train_interactions)
test_sparsity = get_sparsity_factor(test_interactions)
Building interaction matrix: 100%|██████████| 90309/90309 [00:00<00:00, 1964470.97it/s] Building interaction matrix: 100%|██████████| 9695/9695 [00:00<00:00, 1378947.31it/s]
# Report both sparsity factors as percentages, train first.
sparsity_report = (
    f"Train sparsity: {(train_sparsity * 100):.3f}%",
    f"Test sparsity: {(test_sparsity * 100):.3f}%",
)
for report_line in sparsity_report:
    print(report_line)
Train sparsity: 1.485% Test sparsity: 0.159%
# ? binarization is used only to validate ranking metrics
# fit_transform is called on the training split; the test split only goes
# through transform. Both calls run before the id columns are overwritten
# below, so the discretizer receives the original (un-encoded) ids.
rating_discretizer = RatingDiscretizer()
train_discretized_ratings = rating_discretizer.fit_transform(train_ratings)
test_discretized_ratings = rating_discretizer.transform(test_ratings)

# Replace the raw ids with their encoded labels, in place: user ids first
# (train, then test), movie ids second (train, then test).
for id_column, id_encoder in (("userId", user_encoder), ("movieId", movie_encoder)):
    for frame in (train_ratings, test_ratings):
        frame[id_column] = id_encoder.transform(frame[id_column].values)
# Build the rating model from the training interactions and move it to the
# selected device.
model_options = {
    "n_factors": 10,
    "user_encoder": user_encoder,
    "movie_encoder": movie_encoder,
}
model = RNNRatings(train_interactions, **model_options)
model.to(DEVICE)
# Optimisation settings: MSE objective, no extra regularizers.
training_options = {
    "loss": MSELoss(),
    "regularizers": [],
    "lr": 1e-4,
    "weight_decay": 1e-7,
    "epochs": 100,
    "batch_size": 1_000,
}
trainer = Trainer(**training_options)

# Datasets are built from the id-encoded rating frames for the selected device.
train_dataset = get_dataset(train_ratings, DEVICE)
test_dataset = get_dataset(test_ratings, DEVICE)

# Train on the first dataset; the test dataset is handed to the trainer too.
trainer.fit(model, train_dataset, test_dataset)
Training: 100%|██████████| 100/100 [07:17<00:00, 4.38s/it]
# Plot the recorded loss values against the epoch, one coloured line per
# distinct value of the "loss" column.
axis_labels = {"epoch": "Epochs", "loss": "Loss", "value": "MSE"}
loss_history = trainer.get_loss_history()
fig = px.line(
    loss_history,
    x="epoch",
    y="value",
    color="loss",
    title="Convergence",
    labels=axis_labels,
)
fig.show()
# Switch the model to evaluation mode before persisting and scoring it.
model.eval()

# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): this pickles the whole module object, so loading the file
# later requires the RNNRatings class to be importable under the same path.
# Saving model.state_dict() would be more portable, but would change the file
# format for existing loaders — confirm before switching.
torch.save(model, OUTPUT_PATH)
# Mean reciprocal rank on the discretized test ratings, together with the
# individual reciprocal ranks (plotted as a histogram later). Gradient
# tracking is disabled because this is evaluation only.
with torch.no_grad():
    mrr_result = metrics.mean_reciprocal_rank(test_discretized_ratings, model)
    mean_reciprocal_rank, reciprocal_ranks = mrr_result
Testing predictions: 100%|██████████| 656/656 [01:27<00:00, 7.49it/s]
# Mean average precision on the discretized test ratings, together with the
# individual average precisions (plotted as a histogram later). Gradient
# tracking is disabled because this is evaluation only.
with torch.no_grad():
    map_result = metrics.mean_average_precision(test_discretized_ratings, model)
    mean_average_precision, average_precisions = map_result
Testing predictions: 100%|██████████| 656/656 [01:27<00:00, 7.49it/s]
# Mean NDCG on the discretized test ratings, together with the individual
# NDCG scores (plotted as a histogram later). Gradient tracking is disabled
# because this is evaluation only.
with torch.no_grad():
    ndcg_result = metrics.mean_ndcg(test_discretized_ratings, model)
    mean_ndcg, ndcg_ranks = ndcg_result
Testing predictions: 100%|██████████| 656/656 [01:27<00:00, 7.51it/s]
# Coverage of the model on the discretized test ratings (reported as a
# percentage later). Gradient tracking is disabled because this is
# evaluation only.
with torch.no_grad():
    coverage = metrics.coverage(test_discretized_ratings, model)
Testing predictions: 100%|██████████| 656/656 [01:27<00:00, 7.51it/s]
# Root-mean-square error of the model; gradient tracking is disabled because
# this is evaluation only.
# NOTE(review): RMSE is computed against the *discretized* test ratings, while
# the comment next to the discretizer says discretization is only meant for
# the ranking metrics, and the model was trained on ratings divided by their
# maximum. Confirm that these targets are on the same scale as the
# predictions before trusting the reported value.
with torch.no_grad():
    rmse = metrics.rmse(test_discretized_ratings, model)
Testing predictions: 100%|██████████| 9695/9695 [00:08<00:00, 1136.18it/s]
# Ranking metrics and coverage are fractions, reported here as percentages.
print(f"Mean Reciprocal Rank: {(mean_reciprocal_rank * 100):.2f}%")
print(f"Mean Average Precision: {(mean_average_precision * 100):.2f}%")
print(f"Mean NDCG: {(mean_ndcg * 100):.2f}%")
print(f"Coverage: {(coverage * 100):.2f}%")
# RMSE is an absolute error, so it is printed as-is. Four decimals are used:
# a ".42f" spec prints 42 digits, far beyond the ~17 significant digits a
# double-precision float can carry.
print(f"RMSE: {rmse:.4f}")
Mean Reciprocal Rank: 69.83% Mean Average Precision: 65.45% Mean NDCG: 77.73% Coverage: 100.00% RMSE: 2.392498056593340916009537977515719830989838
# One histogram with a marginal box plot per ranking metric, shown in the
# order: reciprocal rank, average precision, NDCG.
# Each entry: (values to plot, figure title, x-axis label).
distributions = (
    (reciprocal_ranks, "Reciprocal Rank Distribution", "Reciprocal Rank"),
    (average_precisions, "Average Precision Distribution", "Average Precision"),
    (ndcg_ranks, "NDCG Score Distribution", "NDCG Score"),
)
for values, title, axis_label in distributions:
    fig = px.histogram(
        x=values,
        marginal="box",
        title=title,
        labels={"x": axis_label},
    )
    fig.show()